clear all
set more off
set maxvar 10000
set rngstate $rdmseed


	use "$Mydirectory1/3_Output/2_PooledData_analysis.dta", clear 
    keep if baseline_sample==1
	
    keep hhid id_within_survey id_all_survey fatheroccej baseline_sample ///  
         father_income_baseline fam_inc_real log_son_baseline ///
         log_father_baseline rank_son_baseline rank_father_baseline ///
         weight_center wgt_sex_race year survey_year dob data decade ///
         age agesq race sex black sib_psid1997 sib_psid2017 ///
         sib_interviewed_1979 sib_nlsyw68 sib_nlsym66 idcode* RELC* ///
         IDC* origid_nlsy

*---------------------------------------------------------------------------*
*---------------------------------------------------------------------------*

********************************************************
* STEP 1: MAKE SIBLING IDENTIFIERS, BY SURVEY 
********************************************************

*-----------------------------*
* NLSYM66 AND NLSYW68
*-----------------------------*

    preserve 

        keep if data=="nlsym66" | data=="nlsyw68"

        keep hhid id_within_survey id_all_survey idcode_* fatheroccej

        sort hhid id_within_survey 
        order hhid id_within_survey

        by hhid: gen pos_inhh = _n
        by hhid: egen hhidsize = max(pos_inhh)
        tab hhidsize,m

        * Dummy: respondent has a sibling in the NLSYM or NLSYW surveys
        gen sib_nlsyoungpeeps =.

        lookfor idcode_ 
        local vars "`r(varlist)'"
        di "`vars'"

    * Look for siblings
     
        //2-person households
        foreach var of local vars {
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+1]==`var' & !inlist(`var',-4,.) & hhidsize==2 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-1]==`var' & !inlist(`var',-4,.) & hhidsize==2 & sib_nlsyoungpeeps==. 
        }
     
        //3-person households
        foreach var of local vars {
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+1]==`var' & !inlist(`var',-4,.) & hhidsize==3 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-1]==`var' & !inlist(`var',-4,.) & hhidsize==3 & sib_nlsyoungpeeps==. 

            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+2]==`var' & !inlist(`var',-4,.) & hhidsize==3 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-2]==`var' & !inlist(`var',-4,.) & hhidsize==3 & sib_nlsyoungpeeps==.     
        }

        //4-person households
        foreach var of local vars {
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+1]==`var' & !inlist(`var',-4,.) & hhidsize==4 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-1]==`var' & !inlist(`var',-4,.) & hhidsize==4 & sib_nlsyoungpeeps==. 

            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+2]==`var' & !inlist(`var',-4,.) & hhidsize==4 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-2]==`var' & !inlist(`var',-4,.) & hhidsize==4 & sib_nlsyoungpeeps==.   

            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+3]==`var' & !inlist(`var',-4,.) & hhidsize==4 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-3]==`var' & !inlist(`var',-4,.) & hhidsize==4 & sib_nlsyoungpeeps==.        
        }

        //5-person households
        foreach var of local vars {
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+1]==`var' & !inlist(`var',-4,.) & hhidsize==5 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-1]==`var' & !inlist(`var',-4,.) & hhidsize==5 & sib_nlsyoungpeeps==. 

            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+2]==`var' & !inlist(`var',-4,.) & hhidsize==5 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-2]==`var' & !inlist(`var',-4,.) & hhidsize==5 & sib_nlsyoungpeeps==.   

            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+3]==`var' & !inlist(`var',-4,.) & hhidsize==5 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-3]==`var' & !inlist(`var',-4,.) & hhidsize==5 & sib_nlsyoungpeeps==.  

            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+4]==`var' & !inlist(`var',-4,.) & hhidsize==5 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-4]==`var' & !inlist(`var',-4,.) & hhidsize==5 & sib_nlsyoungpeeps==.  
        }

        //6-person households
        foreach var of local vars {
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+1]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-1]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==. 

            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+2]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-2]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==.   

            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+3]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-3]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==.  

            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+4]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-4]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==.  
        
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n+5]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==. 
            by hhid: replace sib_nlsyoungpeeps = 1 if id_within_survey[_n-5]==`var' & !inlist(`var',-4,.) & hhidsize==6 & sib_nlsyoungpeeps==.  
        }

        replace sib_nlsyoungpeeps =0 if sib_nlsyoungpeeps==.
        tab sib_nlsyoungpeeps, m

    * Total # of siblings per household
        by hhid: egen numsibs_perhh_nlsyoungpeeps= total(sib_nlsyoungpeeps==1)

    * Give a sibling set the same id
        by hhid: gen chosen_sib = (_n==1 & sib_nlsyoungpeeps==1)
        gen temp = id_all_survey if chosen_sib==1
        by hhid: egen id_chosensib = max(temp) if sib_nlsyoungpeeps==1

        gen sibsetid_nlsyoungpeeps =.
            replace sibsetid_nlsyoungpeeps = id_all_survey if sib_nlsyoungpeeps==0 
            replace sibsetid_nlsyoungpeeps = id_chosensib if sib_nlsyoungpeeps==1
        tab sibsetid_nlsyoungpeeps,m

        drop temp *chosen*

    * Find share of sibling sets that agree on occupation of their father
        by hhid: gen recall = 1 if (fatheroccej==fatheroccej[_n+1] | fatheroccej==fatheroccej[_n-1]) & sib_nlsyoungpeeps==1
        replace recall =0 if recall==.
        tab recall if sib_nlsyoungpeeps==1, m

    /* Make dummy for consistent father occ recall that is consistent 
       across all siblings in a set */
        by hhid: egen good_recall = min(recall) if sib_nlsyoungpeeps==1
        tab good_recall if sib_nlsyoungpeeps==1, m

    *Save
        keep id_all_surveys sib_nlsyoungpeeps sibsetid_nlsyoungpeeps numsibs_perhh_nlsyoungpeeps

        tempfile nls_youngpeeps_sibs
        save `nls_youngpeeps_sibs'

    restore 

*-----------------*
*-----------------*

*-----------------------------*
* NLSY79
*-----------------------------*

    preserve 

        keep if data=="nlsy79"
        keep hhid origid_nlsy id_all_survey IDC* RELC* fatheroccej

        sort hhid origid_nlsy 
        order hhid origid_nlsy

        by hhid: gen pos_inhh = _n
        by hhid: egen hhidsize = max(pos_inhh)
        tab hhidsize,m

    * Dummy: respondent has a sibling in the NLSY79
        gen sib_nlsy79 =.

        forvalues l = 1/5  {

            local cond1 "inlist(RELC`l'_1979,6,7,39,40,59,60,64,65)"
            local condplus1_a "origid_nlsy[_n+1]==IDC`l'_1979"
            local condplus1_b "origid_nlsy[_n-1]==IDC`l'_1979"
            local condplus2_a "origid_nlsy[_n+2]==IDC`l'_1979"
            local condplus2_b "origid_nlsy[_n-2]==IDC`l'_1979"
            local condplus3_a "origid_nlsy[_n+3]==IDC`l'_1979"
            local condplus3_b "origid_nlsy[_n-3]==IDC`l'_1979"
            local condplus4_a "origid_nlsy[_n+4]==IDC`l'_1979"
            local condplus4_b "origid_nlsy[_n-4]==IDC`l'_1979"
            local cond3 "!inlist(RELC`l'_1979,-4,-3,-1,.)"
            local cond4 "!inlist(IDC`l'_1979,-4,-3,-1,.)"

            by hhid: replace sib_nlsy79 = 1 if `cond1' & `condplus1_a' & `cond3' & `cond4' & inlist(hhidsize,2,3,4,5) & sib_nlsy79==. 
            by hhid: replace sib_nlsy79 = 1 if `cond1' & `condplus1_b' & `cond3' & `cond4' & inlist(hhidsize,2,3,4,5) & sib_nlsy79==. 

            by hhid: replace sib_nlsy79 = 1 if `cond1' & `condplus2_a' & `cond3' & `cond4' & inlist(hhidsize,3,4,5) & sib_nlsy79==. 
            by hhid: replace sib_nlsy79 = 1 if `cond1' & `condplus2_b' & `cond3' & `cond4' & inlist(hhidsize,3,4,5) & sib_nlsy79==. 

            by hhid: replace sib_nlsy79 = 1 if `cond1' & `condplus3_a' & `cond3' & `cond4' & inlist(hhidsize,4,5) & sib_nlsy79==. 
            by hhid: replace sib_nlsy79 = 1 if `cond1' & `condplus3_a' & `cond3' & `cond4' & inlist(hhidsize,4,5) & sib_nlsy79==.        

            by hhid: replace sib_nlsy79 = 1 if `cond1' & `condplus4_a' & `cond3' & `cond4' & hhidsize==5 & sib_nlsy79==. 
            by hhid: replace sib_nlsy79 = 1 if `cond1' & `condplus4_b' & `cond3' & `cond4' & hhidsize==5 & sib_nlsy79==.  
        }
        

        replace sib_nlsy79 =0 if sib_nlsy79==.
        tab sib_nlsy79, m

    * Total # of siblings per household
        by hhid: egen numsibs_perhh_nlsy79= total(sib_nlsy79==1)

    * Give a sibling set the same id
        by hhid: gen chosen_sib = (_n==1 & sib_nlsy79==1)
        gen temp = id_all_survey if chosen_sib==1
        by hhid: egen id_chosensib = max(temp) if sib_nlsy79==1

        gen sibsetid_nlsy79 =.
            replace sibsetid_nlsy79 = id_all_survey if sib_nlsy79==0 
            replace sibsetid_nlsy79 = id_chosensib if sib_nlsy79==1
        tab sibsetid_nlsy79,m

    * Find share of sibling sets that agree on occupation of their father
        by hhid: gen recall = 1 if (fatheroccej==fatheroccej[_n+1] | fatheroccej==fatheroccej[_n-1]) & sib_nlsy79==1
        replace recall =0 if recall==.
        tab recall if sib_nlsy79==1, m

    /* Make dummy for consistent father occ recall that is consistent 
       across all siblings in a set */
        by hhid: egen good_recall = min(recall) if sib_nlsy79==1
        tab good_recall if sib_nlsy79==1, m
    
    *Save  
        keep id_all_surveys sib_nlsy79 sibsetid_nlsy79 numsibs_perhh_nlsy79

        tempfile nlsy79_sibs
        save `nlsy79_sibs'

    restore

*-----------------*
*-----------------*

*-----------------------------*
* PSID (1997 AND 2017)
*-----------------------------*

    preserve

        keep if data=="psid" //1997 or 2017 wave
        keep hhid id_within_survey id_all_survey fatheroccej year data

    * Create unique identifier
        clonevar son_id = id_within_survey if (year==1997 | year==2017) & data=="psid" 
    
    * Merge in father ids   
        merge 1:1 son_id using "$PSID/FIMS/FIMSFathersKids_SRC_SEO.dta" 

        drop if _merge==2
        drop _merge

        sort father_id id_within_survey 
        order father_id id_within_survey 

    * Find total # of siblings per father id
        gen number =1
        by father_id: egen Rs_perdadid = total(number ==1) if father_id!=.
        tab Rs_perdadid, m

    * Give a sibling set the same id
        by father_id: gen chosen_sib = (_n==1 & Rs_perdadid>1 & Rs_perdadid<.)
        gen temp = id_all_survey if chosen_sib==1
        by father_id: egen id_chosensib = max(temp) if Rs_perdadid>1 & Rs_perdadid<.

        gen sibsetid_psid =.
        replace sibsetid_psid = id_all_survey if inlist(Rs_perdadid,1,.) 
        replace sibsetid_psid = id_chosensib if Rs_perdadid>1 & Rs_perdadid<.

    * Dummy: respondent has a sibling in the psid 
        gen sib_psid = 1 if Rs_perdadid>1 & Rs_perdadid<.
        replace sib_psid = 0 if Rs_perdadid==1
        tab sib_psid, m

    * Find share of sibling sets that agree on occupation of their father
        by father_id: gen recall = 1 if (fatheroccej==fatheroccej[_n+1] | fatheroccej==fatheroccej[_n-1]) & sib_psid==1
        replace recall =0 if recall==.
        tab recall if sib_psid==1, m

    /* Make dummy for consistent father occ recall that is consistent 
       across all siblings in a set */
        by father_id: egen good_recall = min(recall) if sib_psid==1
        tab good_recall if sib_psid==1, m

    *Save
        ren Rs_perdadid numsibs_perhh_psid
        replace numsibs_perhh_psid =0 if numsibs_perhh_psid==1 //makes var consistent with other surveys

        keep id_all_surveys sib_psid father_id sibsetid_psid numsibs_perhh_psid

        tempfile psid_sibs
        save `psid_sibs'

    restore

* Merge surveys with siblings
    merge 1:1 id_all_survey using `nls_youngpeeps_sibs'
    drop _merge

    merge 1:1 id_all_survey using `nlsy79_sibs' 
    drop _merge

    merge 1:1 id_all_survey using `psid_sibs'
    drop _merge

* Harmonized dummy for sibling status
    gen sib =.
    replace sib = 1 if sib_psid==1 | sib_nlsy79==1 | sib_nlsyoungpeeps==1
    replace sib = 0 if sib==.
    tab sib, m

* Harmonized variable for sibling id
    gen id_sibs =.
    replace id_sibs = sibsetid_psid if data=="psid"
    replace id_sibs = sibsetid_nlsy79 if data=="nlsy79"
    replace id_sibs = sibsetid_nlsyoungpeeps if (data =="nlsyw68" | data =="nlsym66")

* Harmonized variable for # of siblings in household
    gen numsibs_perhh =.
    replace numsibs_perhh = numsibs_perhh_psid if data=="psid"
    replace numsibs_perhh = numsibs_perhh_nlsy79 if data=="nlsy79"
    replace numsibs_perhh = numsibs_perhh_nlsyoungpeeps if (data =="nlsyw68" | data =="nlsym66")
    replace numsibs_perhh =0 if numsibs_perhh==. & data!="psid" //sometimes father id is missing in psid
    tab numsibs_perhh, m 

********************************************************
* STEP 2: MAKE SCATTERPLOT
********************************************************
    preserve

        keep if sib==1 & numsibs_perhh==2
        keep id_sibs fatheroccej father_income_baseline //Note: id_sibs is the sibling set identifier.
        drop if id_sibs==. //Note: no cases of 1 sibling missing an id.

        order id_sibs fatheroccej father_income_baseline

        sort id_sibs
        by id_sibs: gen n = _n
        tab n, m //at most 8 sibs

    * Reshape wide to get one record per sibling id
        reshape wide fatheroccej father_income_baseline, i(id_sibs) j(n)

        pwcorr father_income_baseline1 father_income_baseline2
        local corr1 = `r(rho)'
        local number: display %04.2fc `corr1'   
        
    * Scatterplot
        #delimit ;
            twoway (scatter father_income_baseline2 father_income_baseline1, msymbol(circle_hollow) mcolor(gs12)) (function y = x, ra(father_income_baseline1) clpat(dash_dot) lcolor(ebblue*.75) lwidth(medthick)), 
            xti(" " "Logged pred. parental income, first sibling") yti("Logged pred. parental income, second sibling" " ") 
            legend(off) xscale(range(7 10.5)) xlabel(7(1)10.5) yscale(range(7 10.5)) ylabel(7(1)10.5) 
            ttext(9.5 7.75 "Correlation: `number'", size(medsmall)) ;
        #delimit cr
        graph export "$Mydirectory2/appendix_c/scatterplot_2sibsample.pdf", as(pdf) replace

    restore
